R Markdown

Task 1:Load and clean FacultySalaries_1995.csv file and Re-create the graph below…

View of the clean data

library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(janitor)
## 
## Attaching package: 'janitor'
## 
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(stringr)
library(ggpubr)
library(broom)
library(AICcmodavg)
library(knitr)
library(DT)
df<- read_csv("Data/FacultySalaries_1995.csv")%>% clean_names() %>%   
  pivot_longer(c(ends_with("salary")), names_to = "rank", values_to = "salary",names_prefix = "avg_") %>%
  pivot_longer(c(ends_with("comp")), names_to = "comp_type", values_to = "comp_amt") %>%
  pivot_longer(c(num_full_profs,num_assoc_profs,num_assist_profs), names_to = "faculty_type", values_to = "faculty_count") 
## Rows: 1161 Columns: 17
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (3): UnivName, State, Tier
## dbl (14): FedID, AvgFullProfSalary, AvgAssocProfSalary, AvgAssistProfSalary,...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
df <- df %>% filter(!df$tier == "VIIB")

datatable(df, rownames = FALSE, filter="top", options = list(pageLength = 5, scrollX=T))
## Warning in instance$preRenderHook(instance): It seems your data is too big
## for client-side DataTables. You may consider server-side processing: https://
## rstudio.github.io/DT/server.html

#Graph

df %>%
  ggplot(aes(x=rank, y=salary,fill=rank))+
  geom_boxplot()+
  facet_wrap(~tier)+
  theme_minimal()+
  theme(axis.text.x = element_text(angle = 90))
## Warning: Removed 1152 rows containing non-finite values (`stat_boxplot()`).

TASK 2: ANOVA

twoway<-aov(salary~state+rank+tier, data = df)
summary(twoway)
##                Df    Sum Sq  Mean Sq F value Pr(>F)    
## state          50  51004114  1020082   284.7 <2e-16 ***
## rank            2 150993010 75496505 21071.8 <2e-16 ***
## tier            2  61397872 30698936  8568.3 <2e-16 ***
## Residuals   30113 107889768     3583                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 1152 observations deleted due to missingness

##TASK 3

df2 <- read_csv("Data/Juniper_Oils.csv")  %>% pivot_longer(c("alpha-pinene","para-cymene","alpha-terpineol","cedr-9-ene","alpha-cedrene","beta-cedrene","cis-thujopsene","alpha-himachalene","beta-chamigrene","cuparene","compound 1","alpha-chamigrene","widdrol","cedrol","beta-acorenol","alpha-acorenol","gamma-eudesmol","beta-eudesmol","alpha-eudesmol","cedr-8-en-13-ol","cedr-8-en-15-ol","compound 2","thujopsenal"),names_to = "chem_comp",values_to = "concentration") %>% clean_names()
## Rows: 34 Columns: 41
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): SampleID, Project, Amplicon, Tree_Species, Field_Office, BLM_Fire_...
## dbl (34): BurnYear, Latitude, Longitude, alpha-pinene, para-cymene, alpha-te...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
head(df2,n=5)
## # A tibble: 5 × 20
##   sample_id    project   ampli…¹ tree_…² burn_…³ latit…⁴ longi…⁵ field…⁶ blm_f…⁷
##   <chr>        <chr>     <chr>   <chr>     <dbl>   <dbl>   <dbl> <chr>   <chr>  
## 1 LOG-16S-SL12 JuniperL… 16S     Junipe…    2018    41.6   -114. Salt_L… Ridge  
## 2 LOG-16S-SL12 JuniperL… 16S     Junipe…    2018    41.6   -114. Salt_L… Ridge  
## 3 LOG-16S-SL12 JuniperL… 16S     Junipe…    2018    41.6   -114. Salt_L… Ridge  
## 4 LOG-16S-SL12 JuniperL… 16S     Junipe…    2018    41.6   -114. Salt_L… Ridge  
## 5 LOG-16S-SL12 JuniperL… 16S     Junipe…    2018    41.6   -114. Salt_L… Ridge  
## # … with 11 more variables: tracking_number <chr>, yield_percent <dbl>,
## #   bolt_surface_area_cm2 <dbl>, raw_exit_holes_per_cm2 <dbl>,
## #   raw_exit_holes <dbl>, living_larvae <dbl>, chem_total <dbl>,
## #   chem_mean <dbl>, years_since_burn <dbl>, chem_comp <chr>,
## #   concentration <dbl>, and abbreviated variable names ¹​amplicon,
## #   ²​tree_species, ³​burn_year, ⁴​latitude, ⁵​longitude, ⁶​field_office,
## #   ⁷​blm_fire_name

##Task 4: Graph

df2 %>% ggplot(aes(x=years_since_burn,y=concentration)) + geom_smooth() + facet_wrap(~chem_comp,scales = "free_y")
## `geom_smooth()` using method = 'loess' and formula = 'y ~ x'

##5: generalized linear model

library(broom)

mod1 <- glm(data=df2,formula =  concentration ~  years_since_burn+ chem_comp)

df3 <- tidy(mod1,conf.level = TRUE)

df4<- df3 %>% filter(df3$p.value<.05) 

kable(x = df4,align = "c",caption ="Statistically significant compounds",col.names =c("compound", "estimate" ,"std.error","statistic", "p.value"))
Statistically significant compounds
compound estimate std.error statistic p.value
chem_compalpha-cedrene 10.623529 0.8993768 11.812101 0.0000000
chem_compbeta-cedrene 2.776471 0.8993768 3.087105 0.0020948
chem_compbeta-eudesmol 1.788235 0.8993768 1.988305 0.0471372
chem_compcedr-8-en-13-ol 5.552941 0.8993768 6.174210 0.0000000
chem_compcedrol 19.823529 0.8993768 22.041407 0.0000000
chem_compcis-thujopsene 21.298824 0.8993768 23.681759 0.0000000
chem_compcompound 1 1.817647 0.8993768 2.021007 0.0436301
chem_compcuparene 1.923529 0.8993768 2.138736 0.0327760
chem_compthujopsenal 1.788235 0.8993768 1.988305 0.0471372
chem_compwiddrol 6.135294 0.8993768 6.821718 0.0000000